import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import dalex as dx
import pickle
import ast
# CV_results.csv: cross-validation search results exported earlier;
# the first CSV column is the original row index.
clf_results_df = pd.read_csv('CV_results.csv', index_col = 0)
# Metrics that were scored during cross-validation (match the
# 'mean_test_<metric>' columns of the results frame).
used_metrics = ['roc_auc', 'f1', 'accuracy']
clf_results_df.head()
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_nrounds | param_min_child_weight | param_lambda | param_eta | param_colsample_bytree | param_colsample_bylevel | ... | std_test_f1 | rank_test_f1 | split0_test_accuracy | split1_test_accuracy | split2_test_accuracy | split3_test_accuracy | split4_test_accuracy | mean_test_accuracy | std_test_accuracy | rank_test_accuracy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.686826 | 1.362751 | 0.029591 | 0.000557 | 2222.0 | 5.039684 | 1024.000000 | 0.214311 | 0.666667 | 1.000000 | ... | 0.017854 | 12 | 0.899751 | 0.895225 | 0.900860 | 0.900634 | 0.903576 | 0.900009 | 0.002713 | 7 |
| 1 | 6.655648 | 0.159322 | 0.028604 | 0.003704 | 3889.0 | 43.545280 | 0.462937 | 0.099213 | 0.666667 | 0.666667 | ... | 0.033574 | 28 | 0.887531 | 0.885721 | 0.884563 | 0.888185 | 0.888637 | 0.886927 | 0.001544 | 29 |
| 2 | 2.165086 | 0.115962 | 0.029119 | 0.000515 | 2222.0 | 5.039684 | 0.099213 | 0.004557 | 0.222222 | 0.222222 | ... | 0.005378 | 32 | 0.888210 | 0.886400 | 0.886600 | 0.887053 | 0.886148 | 0.886882 | 0.000727 | 30 |
| 3 | 3.120148 | 0.107695 | 0.034262 | 0.009469 | 1111.0 | 14.813995 | 0.000977 | 0.002109 | 0.444444 | 0.222222 | ... | 0.009058 | 29 | 0.888210 | 0.887531 | 0.886374 | 0.886600 | 0.887053 | 0.887154 | 0.000661 | 28 |
| 4 | 5.744213 | 0.020123 | 0.031193 | 0.003130 | 1.0 | 5.039684 | 0.462937 | 0.002109 | 1.000000 | 0.222222 | ... | 0.018855 | 22 | 0.888889 | 0.889115 | 0.889316 | 0.889995 | 0.892938 | 0.890051 | 0.001490 | 19 |
5 rows × 36 columns
def create_count_df(df, metric, theta):
    """Count, for every threshold in `theta`, how many models beat it.

    Parameters
    ----------
    df : pd.DataFrame
        CV search results containing a 'mean_test_<metric>' column.
    metric : str
        Metric suffix, e.g. 'roc_auc'.
    theta : array-like of float
        Score thresholds to evaluate.

    Returns
    -------
    pd.DataFrame
        Columns 'theta' and 'rashomon_count_for_<metric>'; the count is the
        number of rows with a strictly greater mean test score.
    """
    count = pd.DataFrame(data={'theta': theta})
    # dropna() mirrors the original strict comparison, where NaN scores
    # never counted as "greater".
    scores = np.sort(df['mean_test_' + metric].dropna().to_numpy())
    # Vectorized strict "score > val" count: total scores minus the number
    # of scores <= val (right-side insertion point). Replaces the original
    # O(len(theta) * len(df)) Python loop.
    count['rashomon_count_for_' + metric] = (
        scores.size - np.searchsorted(scores, theta, side='right')
    )
    return count
# Sweep thresholds over [0, 1) and plot, per metric, how many models
# score strictly better than each threshold.
theta = np.arange(0, 1, 0.0001)
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(16,4))
for ax, metric in zip(axs, used_metrics):
    counts = create_count_df(clf_results_df, metric, theta)
    sns.lineplot(data=counts, x='theta', y='rashomon_count_for_' + metric, ax=ax)
    ax.set_ylabel("Liczba modeli", fontsize=14)
    ax.set_xlabel("")
    ax.set_title(metric)
fig.text(0.5, 0.01, "theta", fontsize=14)
fig.suptitle('Liczba modeli które osiągnęły wynik lepszy niż theta\n przy podanych metrykach', y = 1.1, fontsize = 16)
plt.show()
# Spread (max - min) of mean CV test scores across all candidates —
# per the outputs below, accuracy separates the models far less than
# f1 or roc_auc.
clf_results_df.mean_test_accuracy.max() - clf_results_df.mean_test_accuracy.min() # interesting
0.027068726949791233
clf_results_df.mean_test_f1.max() - clf_results_df.mean_test_f1.min()
0.5313490826292734
clf_results_df.mean_test_roc_auc.max() - clf_results_df.mean_test_roc_auc.min()
0.24337631442410068
def get_k_best(df, metric, k):
    """Return the index labels of the k rows with the highest
    'mean_test_<metric>' score, as a set."""
    ranked = df.sort_values(by='mean_test_' + metric, ascending=False)
    return set(ranked.head(k).index)
# Echo the metric list and preview the top-10 model indices by ROC AUC
# (notebook cell outputs kept verbatim below).
used_metrics
['roc_auc', 'f1', 'accuracy']
get_k_best(clf_results_df, used_metrics[0], 10)
{0, 8, 16, 17, 19, 38, 41, 43, 44, 47}
# Models that rank in the top 10 for every metric form the Rashomon set.
indexes_of_best = set.intersection(get_k_best(clf_results_df, used_metrics[0], 10),
                                   get_k_best(clf_results_df, used_metrics[1], 10),
                                   get_k_best(clf_results_df, used_metrics[2], 10))
# Sort the indices before selection: iterating a plain set has
# implementation-defined order, which made the mapping to xgb_1..xgb_5
# below non-reproducible.
best_params = clf_results_df.iloc[sorted(indexes_of_best), :]['params']
best_params.reset_index(drop=True, inplace=True)
for params in best_params:
    print(params, "\n")
{'nrounds': 1.0, 'min_child_weight': 14.813995396596646, 'lambda': 1024.0, 'eta': 0.46293735614364534, 'colsample_bytree': 0.6666666666666666, 'colsample_bylevel': 0.2222222222222222, 'alpha': 0.004556754060844206}
{'nrounds': 1667.0, 'min_child_weight': 25.398416831491197, 'lambda': 0.09921256574801249, 'eta': 0.46293735614364534, 'colsample_bytree': 0.4444444444444444, 'colsample_bylevel': 0.4444444444444444, 'alpha': 0.021262343752724643}
{'nrounds': 1667.0, 'min_child_weight': 2.9394689845511977, 'lambda': 2.1601194777846118, 'eta': 0.09921256574801249, 'colsample_bytree': 0.2222222222222222, 'colsample_bylevel': 0.6666666666666666, 'alpha': 0.004556754060844206}
{'nrounds': 2222.0, 'min_child_weight': 14.813995396596646, 'lambda': 0.46293735614364534, 'eta': 0.46293735614364534, 'colsample_bytree': 0.8888888888888888, 'colsample_bylevel': 0.8888888888888888, 'alpha': 47.03150375281921}
{'nrounds': 556.0, 'min_child_weight': 74.65785853287147, 'lambda': 0.09921256574801249, 'eta': 0.09921256574801249, 'colsample_bytree': 0.5555555555555556, 'colsample_bylevel': 0.6666666666666666, 'alpha': 0.46293735614364534}
def get_task(path, task_name='mort'):
    """Load the pickled label list at `path` and extract one task's labels.

    Parameters
    ----------
    path : str
        Path to a pickle file holding an iterable of per-sample label
        sequences, ordered (mort, readmit, los, dx).
    task_name : str, optional
        Which task column to extract: 'mort', 'readmit', 'los' or 'dx'.
        Defaults to 'mort', preserving the original behavior.

    Returns
    -------
    np.ndarray
        The selected task's label per sample.

    Raises
    ------
    KeyError
        If `task_name` is not one of the four known tasks.
    """
    # SECURITY: pickle.load executes arbitrary code — only use on trusted files.
    with open(path, 'rb') as f:
        labels = pickle.load(f)
    dct = {'mort': 0, 'readmit': 1, 'los': 2, 'dx': 3}
    task = [yy[dct[task_name]] for yy in labels]
    return np.array(task)
# Load the local MIMIC-derived arrays (paths assume the ./local_mimic layout).
X = np.load("./local_mimic/save/X48.npy")
Z = np.load("./local_mimic/save/w2v.npy")  # presumably w2v text embeddings — confirm
y = get_task("./local_mimic/save/y")  # mortality labels (get_task defaults to 'mort')
# Concatenate the two feature blocks column-wise into one design matrix.
X, Z, y = np.array(X), np.array(Z), np.array(y)
X = np.append(X, Z, axis=1)
X = pd.DataFrame(X)
# ast.literal_eval safely parses each stringified hyperparameter dict back
# into a dict; ** unpacks it into XGBClassifier keyword arguments.
xgb_1, xgb_2, xgb_3, xgb_4, xgb_5 = (
    XGBClassifier(**ast.literal_eval(param_str)) for param_str in best_params[:5]
)
# Sanity check: the classifier repr should echo the requested parameters.
print(best_params[4])
xgb_5
{'nrounds': 556.0, 'min_child_weight': 74.65785853287147, 'lambda': 0.09921256574801249, 'eta': 0.09921256574801249, 'colsample_bytree': 0.5555555555555556, 'colsample_bylevel': 0.6666666666666666, 'alpha': 0.46293735614364534}
XGBClassifier(alpha=0.46293735614364534, colsample_bylevel=0.6666666666666666,
colsample_bytree=0.5555555555555556, eta=0.09921256574801249,
lambda=0.09921256574801249, min_child_weight=74.65785853287147,
nrounds=556.0)
# Fit every Rashomon-set candidate on the full design matrix
# (no train/test split is performed here).
models = [xgb_1, xgb_2, xgb_3, xgb_4, xgb_5]
for clf in models:
    clf.fit(X, y)
# One dalex explainer per fitted model, labelled "XGB 1" .. "XGB 5".
xgb_1_exp, xgb_2_exp, xgb_3_exp, xgb_4_exp, xgb_5_exp = (
    dx.Explainer(model, X, y, label="XGB %d" % i)
    for i, model in enumerate([xgb_1, xgb_2, xgb_3, xgb_4, xgb_5], start=1)
)
Preparation of a new explainer is initiated -> data : 27616 rows 276 cols -> target variable : 27616 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : XGB 1 -> predict function : <function yhat_proba_default at 0x0000021245ED37B8> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.00176, mean = 0.119, max = 0.965 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.883, mean = -9.13e-05, max = 0.989 -> model_info : package xgboost A new explainer has been created! Preparation of a new explainer is initiated -> data : 27616 rows 276 cols -> target variable : 27616 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : XGB 2 -> predict function : <function yhat_proba_default at 0x0000021245ED37B8> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.00158, mean = 0.119, max = 0.966 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.85, mean = -8.55e-05, max = 0.99 -> model_info : package xgboost A new explainer has been created! Preparation of a new explainer is initiated -> data : 27616 rows 276 cols -> target variable : 27616 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : XGB 3 -> predict function : <function yhat_proba_default at 0x0000021245ED37B8> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. 
-> predicted values : min = 0.00162, mean = 0.119, max = 0.982 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.853, mean = -6.63e-05, max = 0.988 -> model_info : package xgboost A new explainer has been created! Preparation of a new explainer is initiated -> data : 27616 rows 276 cols -> target variable : 27616 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : XGB 4 -> predict function : <function yhat_proba_default at 0x0000021245ED37B8> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.00157, mean = 0.119, max = 0.985 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.832, mean = -6.02e-05, max = 0.991 -> model_info : package xgboost A new explainer has been created! Preparation of a new explainer is initiated -> data : 27616 rows 276 cols -> target variable : 27616 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : XGB 5 -> predict function : <function yhat_proba_default at 0x0000021245ED37B8> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.0018, mean = 0.119, max = 0.953 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.893, mean = -9.38e-05, max = 0.991 -> model_info : package xgboost A new explainer has been created!
# Permutation-based variable importance (dalex model_parts) per explainer.
vi_xgb_1 = xgb_1_exp.model_parts()
vi_xgb_2 = xgb_2_exp.model_parts()
vi_xgb_3 = xgb_3_exp.model_parts()
vi_xgb_4 = xgb_4_exp.model_parts()
vi_xgb_5 = xgb_5_exp.model_parts()
# Draw all five importance rankings on one figure for comparison.
vi_xgb_1.plot([vi_xgb_2, vi_xgb_3, vi_xgb_4, vi_xgb_5])
def get_k_best_variables(vi_model, k):
    """Return the variable names ranked 2..k by dropout loss, as a set.

    NOTE(review): the slice starts at 1, so the top-ranked row is skipped —
    presumably a dalex baseline/full-model entry — and only k-1 names are
    returned. Confirm this is intended.
    """
    ranked = vi_model.result.sort_values(by='dropout_loss', ascending=False)
    return set(ranked.iloc[1:k]['variable'])
# Variables ranked highly by every one of the five models.
# NOTE(review): with k=5 each call yields at most 4 names because of the
# [1:k] slice in get_k_best_variables — confirm intended.
important_variables = set.intersection(get_k_best_variables(vi_xgb_1, 5),
                                       get_k_best_variables(vi_xgb_2, 5),
                                       get_k_best_variables(vi_xgb_3, 5),
                                       get_k_best_variables(vi_xgb_4, 5),
                                       get_k_best_variables(vi_xgb_5, 5))
important_variables
{211, 253}
# Partial-dependence profiles restricted to the jointly-important variables.
pd_xgb_1, pd_xgb_2, pd_xgb_3, pd_xgb_4, pd_xgb_5 = (
    exp.model_profile(variables=list(important_variables))
    for exp in (xgb_1_exp, xgb_2_exp, xgb_3_exp, xgb_4_exp, xgb_5_exp)
)
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 4.24it/s] Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3.19it/s] Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 4.23it/s] Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2.80it/s] Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3.51it/s]
# Overlay the partial-dependence profiles of all five models.
pd_xgb_1.plot([pd_xgb_2, pd_xgb_3, pd_xgb_4, pd_xgb_5])
# Accumulated-dependence profiles for the same variables
# (dalex type='accumulated').
al_xgb_1, al_xgb_2, al_xgb_3, al_xgb_4, al_xgb_5 = (
    exp.model_profile(variables=list(important_variables), type='accumulated')
    for exp in (xgb_1_exp, xgb_2_exp, xgb_3_exp, xgb_4_exp, xgb_5_exp)
)
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3.99it/s] Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 2/2 [00:00<00:00, 3.62it/s] Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 4.89it/s] Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 2/2 [00:00<00:00, 7.96it/s] Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 4.29it/s] Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 2/2 [00:00<00:00, 6.45it/s] Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 4.39it/s] Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 2/2 [00:00<00:00, 5.57it/s] Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 3.95it/s] Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 2/2 [00:01<00:00, 1.95it/s]
# Overlay the accumulated-dependence profiles of all five models.
al_xgb_1.plot([al_xgb_2, al_xgb_3, al_xgb_4, al_xgb_5])
al_models = [al_xgb_1, al_xgb_2, al_xgb_3, al_xgb_4, al_xgb_5]
pd_models = [pd_xgb_1, pd_xgb_2, pd_xgb_3, pd_xgb_4, pd_xgb_5]
# Re-label the accumulated profiles so the two profile kinds remain
# distinguishable when drawn on the same figure.
for al_profile in al_models:
    al_profile.result['_label_'] += ' AL profiles'
# Pair up each model's accumulated and partial-dependence profiles.
for al_profile, pd_profile in zip(al_models, pd_models):
    al_profile.plot(pd_profile)
# Hyperparameter names matching the 'param_*' columns of CV_results.csv.
params = ['nrounds', 'min_child_weight', 'lambda', 'eta', 'colsample_bytree', 'colsample_bylevel', 'alpha']
def plot_one_param(df, param, metrics):
    """Plot mean CV test score against one hyperparameter, one subplot
    per metric.

    Parameters
    ----------
    df : pd.DataFrame
        CV results with 'param_<param>' and 'mean_test_<metric>' columns.
    param : str
        Hyperparameter name (suffix of the 'param_' column).
    metrics : sequence of str
        Exactly three metric names, one per subplot.
    """
    # Group once — the original recomputed the groupby in every loop
    # iteration. numeric_only=True avoids a TypeError on non-numeric
    # columns (e.g. the stringified 'params' column) under pandas >= 2.0,
    # and reset_index keeps the grouping key available as a column for
    # seaborn's name lookup.
    grouped = df.groupby('param_' + param).mean(numeric_only=True).reset_index()
    fig, axs = plt.subplots(1, 3, sharey=True, figsize=(8, 4))
    for i in range(3):
        sns.lineplot(ax=axs[i], data=grouped, x='param_' + param, y='mean_test_' + metrics[i])
        axs[i].set_ylabel('mean test score', fontsize=14)
        axs[i].set_xlabel("")
        if i == 1:
            # Only the middle subplot carries the shared x-axis label.
            axs[i].set_xlabel("param_" + param, fontsize=12)
        axs[i].set_title(metrics[i])
# One figure per hyperparameter, showing its marginal effect on each metric.
for param in params:
    plot_one_param(clf_results_df, param, used_metrics)